{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPU Extreme gradient boosting trained on TF-IDF reduced 50 dimensions\n",
    "\n",
    "1. Same emotion dataset from [NLP-dataset](https://github.com/huseinzol05/NLP-Dataset)\n",
    "2. Same splitting 80% training, 20% testing, may vary depends on randomness\n",
    "3. Same regex substitution '[^\\\"\\'A-Za-z0-9 ]+'\n",
    "\n",
    "## Example\n",
    "\n",
    "Based on Term-frequency Inverse document frequency\n",
    "\n",
    "After that we apply SVD to reduce the dimensions, n_components = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import xgboost as xgb\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "import numpy as np\n",
    "import re\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import sklearn.datasets\n",
    "from sklearn import pipeline\n",
    "from sklearn.model_selection import StratifiedKFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clearstring(string):\n",
    "    string = re.sub('[^\\\"\\'A-Za-z0-9 ]+', '', string)\n",
    "    string = string.split(' ')\n",
    "    string = filter(None, string)\n",
    "    string = [y.strip() for y in string]\n",
    "    string = ' '.join(string)\n",
    "    return string\n",
    "\n",
    "# because of sklean.datasets read a document as a single element\n",
    "# so we want to split based on new line\n",
    "def separate_dataset(trainset):\n",
    "    datastring = []\n",
    "    datatarget = []\n",
    "    for i in range(len(trainset.data)):\n",
    "        data_ = trainset.data[i].split('\\n')\n",
    "        # python3, if python2, just remove list()\n",
    "        data_ = list(filter(None, data_))\n",
    "        for n in range(len(data_)):\n",
    "            data_[n] = clearstring(data_[n])\n",
    "        datastring += data_\n",
    "        for n in range(len(data_)):\n",
    "            datatarget.append(trainset.target[i])\n",
    "    return datastring, datatarget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "trainset_data = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset_data.data, trainset_data.target = separate_dataset(trainset_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(trainset_data.data, trainset_data.target, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "decompose = pipeline.Pipeline([('count', TfidfVectorizer()),\n",
    "                               ('svd', TruncatedSVD(n_components=50))]).fit(trainset_data.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "params_xgd = {\n",
    "    'min_child_weight': 10.0,\n",
    "    'objective': 'multi:softprob',\n",
    "    'eval_metric': 'mlogloss',\n",
    "    'num_class': len(trainset_data.target_names),\n",
    "    'max_depth': 7,\n",
    "    'max_delta_step': 1.8,\n",
    "    'colsample_bytree': 0.4,\n",
    "    'subsample': 0.8,\n",
    "    'eta': 0.03,\n",
    "    'gamma': 0.65,\n",
    "    'num_boost_round' : 700,\n",
    "    'gpu_id': 0,\n",
    "    'tree_method': 'gpu_hist'\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_X = decompose.transform(train_X)\n",
    "test_X = decompose.transform(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-mlogloss:1.78223\tvalid-mlogloss:1.78259\n",
      "Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.\n",
      "\n",
      "Will train until valid-mlogloss hasn't improved in 100 rounds.\n",
      "[50]\ttrain-mlogloss:1.5574\tvalid-mlogloss:1.57311\n",
      "[100]\ttrain-mlogloss:1.49457\tvalid-mlogloss:1.52355\n",
      "[150]\ttrain-mlogloss:1.46417\tvalid-mlogloss:1.50584\n",
      "[200]\ttrain-mlogloss:1.44302\tvalid-mlogloss:1.4967\n",
      "[250]\ttrain-mlogloss:1.42599\tvalid-mlogloss:1.49095\n",
      "[300]\ttrain-mlogloss:1.4111\tvalid-mlogloss:1.48669\n",
      "[350]\ttrain-mlogloss:1.39748\tvalid-mlogloss:1.48334\n",
      "[400]\ttrain-mlogloss:1.38507\tvalid-mlogloss:1.48065\n",
      "[450]\ttrain-mlogloss:1.3733\tvalid-mlogloss:1.47826\n",
      "[500]\ttrain-mlogloss:1.36218\tvalid-mlogloss:1.47627\n",
      "[550]\ttrain-mlogloss:1.3517\tvalid-mlogloss:1.47462\n",
      "[600]\ttrain-mlogloss:1.3416\tvalid-mlogloss:1.47312\n",
      "[650]\ttrain-mlogloss:1.3319\tvalid-mlogloss:1.47175\n",
      "[700]\ttrain-mlogloss:1.32257\tvalid-mlogloss:1.47055\n",
      "[750]\ttrain-mlogloss:1.31332\tvalid-mlogloss:1.46951\n",
      "[800]\ttrain-mlogloss:1.30438\tvalid-mlogloss:1.46853\n",
      "[850]\ttrain-mlogloss:1.29571\tvalid-mlogloss:1.46762\n",
      "[900]\ttrain-mlogloss:1.28722\tvalid-mlogloss:1.46692\n",
      "[950]\ttrain-mlogloss:1.27865\tvalid-mlogloss:1.466\n",
      "[1000]\ttrain-mlogloss:1.2703\tvalid-mlogloss:1.46529\n",
      "[1050]\ttrain-mlogloss:1.26222\tvalid-mlogloss:1.46462\n",
      "[1100]\ttrain-mlogloss:1.25412\tvalid-mlogloss:1.46393\n",
      "[1150]\ttrain-mlogloss:1.24628\tvalid-mlogloss:1.46336\n",
      "[1200]\ttrain-mlogloss:1.23838\tvalid-mlogloss:1.46291\n",
      "[1250]\ttrain-mlogloss:1.23068\tvalid-mlogloss:1.46241\n",
      "[1300]\ttrain-mlogloss:1.22302\tvalid-mlogloss:1.46196\n",
      "[1350]\ttrain-mlogloss:1.21537\tvalid-mlogloss:1.46156\n",
      "[1400]\ttrain-mlogloss:1.20791\tvalid-mlogloss:1.46119\n",
      "[1450]\ttrain-mlogloss:1.20057\tvalid-mlogloss:1.46089\n",
      "[1500]\ttrain-mlogloss:1.19344\tvalid-mlogloss:1.46059\n",
      "[1550]\ttrain-mlogloss:1.18621\tvalid-mlogloss:1.46027\n",
      "[1600]\ttrain-mlogloss:1.17919\tvalid-mlogloss:1.46001\n",
      "[1650]\ttrain-mlogloss:1.17216\tvalid-mlogloss:1.45981\n",
      "[1700]\ttrain-mlogloss:1.1653\tvalid-mlogloss:1.45963\n",
      "[1750]\ttrain-mlogloss:1.15829\tvalid-mlogloss:1.45947\n",
      "[1800]\ttrain-mlogloss:1.1515\tvalid-mlogloss:1.45937\n",
      "[1850]\ttrain-mlogloss:1.14457\tvalid-mlogloss:1.45917\n",
      "[1900]\ttrain-mlogloss:1.13782\tvalid-mlogloss:1.45899\n",
      "[1950]\ttrain-mlogloss:1.13112\tvalid-mlogloss:1.45888\n",
      "[2000]\ttrain-mlogloss:1.12465\tvalid-mlogloss:1.45878\n",
      "[2050]\ttrain-mlogloss:1.11823\tvalid-mlogloss:1.45861\n",
      "[2100]\ttrain-mlogloss:1.11185\tvalid-mlogloss:1.45857\n",
      "[2150]\ttrain-mlogloss:1.10546\tvalid-mlogloss:1.45857\n",
      "[2200]\ttrain-mlogloss:1.09918\tvalid-mlogloss:1.45854\n",
      "[2250]\ttrain-mlogloss:1.09285\tvalid-mlogloss:1.45861\n",
      "Stopping. Best iteration:\n",
      "[2190]\ttrain-mlogloss:1.1004\tvalid-mlogloss:1.45851\n",
      "\n"
     ]
    }
   ],
   "source": [
    "d_train = xgb.DMatrix(train_X, train_Y)\n",
    "d_valid = xgb.DMatrix(test_X, test_Y)\n",
    "watchlist = [(d_train, 'train'), (d_valid, 'valid')]\n",
    "#with open('clf.p', 'rb') as fopen:\n",
    "#    clf = pickle.load(fopen)\n",
    "clf = xgb.train(params_xgd, d_train, 100000, watchlist, early_stopping_rounds=100, maximize=False, verbose_eval=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.42349031933015041"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(test_Y == np.argmax(clf.predict(xgb.DMatrix(test_X), ntree_limit=clf.best_ntree_limit), axis = 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.34      0.07      0.11     11336\n",
      "       fear       0.30      0.06      0.10      9694\n",
      "        joy       0.46      0.73      0.56     28068\n",
      "       love       0.17      0.01      0.03      6987\n",
      "    sadness       0.40      0.54      0.46     24277\n",
      "   surprise       0.07      0.00      0.01      3000\n",
      "\n",
      "avg / total       0.37      0.42      0.35     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "print(metrics.classification_report(test_Y, np.argmax(clf.predict(xgb.DMatrix(test_X), ntree_limit=clf.best_ntree_limit), axis = 1), target_names = trainset_data.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.42361027806434587"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.save_model('xgb-tfidf-svd50.model')\n",
    "bst = xgb.Booster(params_xgd)\n",
    "bst.load_model('xgb-tfidf-svd50.model')\n",
    "import json\n",
    "with open('xgb-tfidf-svd50-param', 'w') as fopen:\n",
    "    fopen.write(json.dumps(params_xgd))\n",
    "np.mean(test_Y == np.argmax(bst.predict(xgb.DMatrix(test_X)), axis = 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
